load data

geno <- read_delim("tables/TableS1_genotypes.tsv")
pheno <- read_delim("tables/TableS2_phenotypes_metadata.tsv")
cipro_antibiogram <- full_join(geno,pheno)

set colours

wt_colours <- c(`non-wt (I/R)`="IndianRed", `wt (S)`= "LightBlue")
res_colours <- c("I"="grey", "R"="IndianRed", "S"="LightBlue", "NWT"="grey")
res_colours2 <- c("I"="black", "R"="IndianRed", "S"="LightBlue", "NWT"="black")
res_colours3 <- c("NWT I"="#78638a", "NWT R"="IndianRed", "WT S"="LightBlue")
# position dodge for coefficient plots
pd <- position_dodge(width=0.8)

function to tabulate estimate, 95% CI, p-value from logistic regression model

logreg_details <- function(model) {
  model_summary <- cbind(est=model$coefficients, ci.lower=model$ci.lower, ci.upper=model$ci.upper, pval=model$prob) %>%
    as_tibble(rownames="Determinant")
  return(model_summary)
}

function to tabulate estimate, 95% CI, p-value from linear regression model

linreg_details <- function(model) {
  ci <- confint(model) %>% as_tibble(rownames="Determinant") %>%
    rename(ci.lower=`2.5 %`, ci.upper=`97.5 %`)
  model_summary <- summary(model)$coef %>% 
    as_tibble(rownames="Determinant") %>% 
    rename(est=Estimate, pval=`Pr(>|t|)`) %>% 
    select(Determinant, est, pval) %>% left_join(ci)
  return(model_summary)
}

functions to calculate PPV and categorical agreement from models

cat_agree <- function(pred,truth) {
  xtabs <- table(pred,factor(truth, levels = 0:1))
  return((xtabs[1,1]+xtabs[2,2])/sum(xtabs))
}

ppv <- function(pred,truth) {
  xtabs <- table(pred,truth)
  return(xtabs[2,2]/sum(xtabs[2,]))
}

npv <- function(pred,truth) {
  xtabs <- table(pred,truth)
  return(xtabs[1,1]/sum(xtabs[1,]))
}

#sens <- function(pred,truth) {
#  xtabs <- table(pred,truth)
#  return(xtabs[2,2]/(sum(xtabs[,2])))
#}

sens <- function(pred,truth) {
  xtabs <- table(pred,factor(truth, levels = 0:1))
  return(xtabs[2,2]/(sum(xtabs[,2])))
}

spec <- function(pred,truth) {
  xtabs <- table(pred,truth)
  return(xtabs[1,1]/(sum(xtabs[,1])))
}

me <- function(pred,truth) {
  xtabs <- table(pred,truth)
  return(xtabs[2,1]/sum(xtabs[,1]))
}

vme <- function(pred,truth) {
  xtabs <- table(pred,factor(truth, levels = 0:1))
  return(xtabs[1,2]/sum(xtabs[,2]))
}

metrics <- function(pred,truth) {
  xtabs <- table(pred,truth)
  return(list(cat=cat_agree(pred,truth),
              ppv=ppv(pred,truth),
              npv=npv(pred,truth),
              sens=sens(pred,truth),
              spec=spec(pred,truth),
              me=me(pred,truth),
              vme=vme(pred,truth),
              n=sum(xtabs), 
              summary=c("cat"=cat_agree(pred,truth),
                    "sens"=sens(pred,truth),
                    "spec"=spec(pred,truth),
                    "me"=me(pred,truth),
                    "vme"=vme(pred,truth),
                    "n"=sum(xtabs)
              )
          )
    )
}

Logistic regression models

Determinants vs R

# prepare matrix for modelling
R_vs_dets <- cipro_antibiogram %>% 
  select(`GyrA-83F`:`qnrVC6`, aac6, resistant) %>%
  rename(`aac(6')-Ib-cr`=aac6) %>%
  select(-c(ends_with('?'), ends_with('*'))) # remove imprecise matches

# exclude rare variants
R_vs_dets <- R_vs_dets[,colSums(R_vs_dets) >= 20]

# model R vs all common variants (N>=20) = Model 1a
model_R_indiv <- logistf(resistant ~ ., data=R_vs_dets)

# model R vs all common variants (N>=20), each with aac6 interaction = Model 1b
model_R_indiv_aac6Interaction <- logistf(resistant ~ .*`aac(6')-Ib-cr`, data=R_vs_dets)

Summarise/plot coefficients per determinant - R models

# summarise model output
logreg_model1a <- logreg_details(model_R_indiv) %>% mutate(interaction=NA) %>% mutate(model="a") %>% relocate(interaction, .after=Determinant) %>%
    mutate(Determinant=gsub("`","",Determinant)) 
logreg_model1b <- logreg_details(model_R_indiv_aac6Interaction) %>% 
  rename(Term=Determinant) %>%
  separate_wider_delim(Term, ":", names=c("Determinant","interaction"), too_few="align_start", cols_remove=F) %>% 
  mutate(model="b") %>%
  mutate(Determinant=gsub("`","",Determinant))  %>%
  mutate(Term=gsub("`","",Term))  %>%
  mutate(interaction=gsub("`","",interaction)) 

# all individual determinants, plus significant positive interactions
log_reg_R_plot <- logreg_model1a %>% bind_rows(logreg_model1b) %>%
  filter((pval<0.05 & est>0) | is.na(interaction)) %>%
  mutate(interaction=factor(replace(interaction, is.na(interaction), "none"), levels=c("none", "aac(6')-Ib-cr"))) %>%
  filter(Determinant != "(Intercept)") %>%
  ggplot(aes(y=Determinant)) + 
  geom_vline(xintercept=0) + 
  geom_linerange(aes(xmin=ci.lower, xmax=ci.upper, group=model, col=model, linetype=interaction), position=pd) +
  geom_point(aes(x=est, group=model, col=model), position=pd) + 
  scale_colour_manual(values=c(a="#f1933b", b="#bd1515")) +
  theme_bw() + labs(x="Regression coefficient", col="R Model", linetype="Interaction")

log_reg_R_plot

Determinants vs NWT

# prepare matrix for modelling
NWT_vs_dets <- cipro_antibiogram %>% 
  select(`GyrA-83F`:`qnrVC6`, aac6, nonWT_binary) %>%
  rename(`aac(6')-Ib-cr`=aac6) %>%
  select(-c(ends_with('?'), ends_with('*'))) # remove imprecise matches

# exclude genomes with rare variants
NWT_vs_dets <- NWT_vs_dets[,colSums(NWT_vs_dets) >= 20]

# model NWT vs all common variants (N>=20) = Model 3a
model_NWT_indiv <- logistf(nonWT_binary ~ ., data=NWT_vs_dets)

# model NWT vs all common variants (N>=20), each with aac6 interaction = Model 3b
model_NWT_indiv_aac6Interaction <- logistf(nonWT_binary ~ .*`aac(6')-Ib-cr`, data=NWT_vs_dets)

Summarise/plot coefficients per determinant - NWT models

# summarise model output
logreg_model3a <- logreg_details(model_NWT_indiv) %>% mutate(interaction=NA) %>% mutate(model="a") %>% relocate(interaction, .after=Determinant) %>%
    mutate(Determinant=gsub("`","",Determinant)) 
logreg_model3b <- logreg_details(model_NWT_indiv_aac6Interaction) %>% 
  rename(Term=Determinant) %>%
  separate_wider_delim(Term, ":", names=c("Determinant","interaction"), too_few="align_start", cols_remove=F) %>% 
  mutate(model="b") %>%
  mutate(Determinant=gsub("`","",Determinant))  %>%
  mutate(Term=gsub("`","",Term))  %>%
  mutate(interaction=gsub("`","",interaction)) 

# get count per marker to label on the right
marker_count <- cipro_antibiogram %>% 
  rename(`aac(6')-Ib-cr`=aac6) %>%
  select(`GyrA-83F`:`qnrVC6`, `aac(6')-Ib-cr`) %>%
  colSums()

marker_count_order <- logreg_model3a %>% bind_rows(logreg_model3b) %>%
    filter((pval<0.05 & est>0) | is.na(interaction)) %>%
    filter(Determinant != "(Intercept)") %>% pull(Determinant) %>% unique() %>% sort()

log_reg_NWT_plot <- logreg_model3a %>% bind_rows(logreg_model3b) %>%
  filter((pval<0.05 & est>0) | is.na(interaction)) %>%
  mutate(interaction=factor(replace(interaction, is.na(interaction), "none"), levels=c("none", "aac(6')-Ib-cr"))) %>%
  filter(Determinant != "(Intercept)") %>%
  ggplot(aes(y=Determinant)) + 
  geom_vline(xintercept=0) + 
  geom_linerange(aes(xmin=ci.lower, xmax=ci.upper, group=model, col=model, linetype=interaction), position=pd) +
  geom_point(aes(x=est, group=model, col=model), position=pd) + 
  scale_colour_manual(values=c(a="#70bfef", b="#2c15ae")) +
  theme_bw() + labs(x="Regression coefficient", col="NWT Model", linetype="Interaction", y="") + 
  scale_y_discrete(labels=paste0("n=",marker_count[marker_count_order]), position="right")

log_reg_NWT_plot

TableS6 - write out logistic regression coefficients

multiVariable_vs_singleVariable <- logreg_model1a %>% select(-c(interaction, model)) %>%
  setNames(paste0('R Model1a.', names(.))) %>% rename(Term='R Model1a.Determinant') %>% 
  full_join(logreg_model1b %>% select(-c(Determinant, interaction, model)) %>% setNames(paste0('R Model1b.', names(.)))%>% rename(Term='R Model1b.Term'), by="Term") %>% 
  full_join(logreg_model3a %>% select(-c(interaction, model)) %>% setNames(paste0('NWT Model1a.', names(.))) %>% rename(Term='NWT Model1a.Determinant'), by="Term") %>% 
  full_join(logreg_model3b %>% select(-c(Determinant, interaction, model)) %>% setNames(paste0('NWT Model1b.', names(.)))%>% rename(Term='NWT Model1b.Term'), by="Term")

write_csv(multiVariable_vs_singleVariable, file="tables/TableS6_LogRegCoef.csv")

read in Table 1, solo PPVs

ppv_solo <- read_csv("tables/TableS5_determinantStats.csv") %>% filter(!is.na(N.solo)) %>% select(Determinant, ends_with(".solo"))

ppvR <- ppv_solo %>% mutate(p=R.solo/N.solo, se=sqrt(p*(1-p)/N.solo), ci.lower=p-1.96*se, ci.upper=p+1.96*se) %>% select(Determinant, p, ci.lower, ci.upper) %>% mutate(category="R") 

ppvNWT <- ppv_solo %>% mutate(p=NWT.solo/N.solo, se=sqrt(p*(1-p)/N.solo), ci.lower=p-1.96*se, ci.upper=p+1.96*se) %>% select(Determinant, p, ci.lower, ci.upper, N.solo) %>% mutate(category="NWT")

ppvR_plot <- ppvR %>% 
  ggplot(aes(y=Determinant)) +
  geom_vline(xintercept=50, linetype=2) +
  geom_linerange(aes(xmin=ci.lower*100, xmax=ci.upper*100), col="#bd1515", position=pd) +
  geom_point(aes(x=p*100, group=category), col="#bd1515", position=pd) + 
  theme_bw() + labs(x="PPV (%)")

ppvNWT_plot <- ppvNWT %>% 
  ggplot(aes(y=Determinant)) +
  geom_vline(xintercept=50, linetype=2) +
  geom_linerange(aes(xmin=ci.lower*100, xmax=ci.upper*100), col="#2c15ae", position=pd) +
  geom_point(aes(x=p*100, group=category), col="#2c15ae", position=pd) + 
  theme_bw() + labs(x="PPV (%)", y="") + 
  scale_y_discrete(labels=paste0("n=",ppvNWT$N.solo), position="right")

Figure 1 - PPV and regression model coefficients, for individual determinants

(ppvR_plot + ggtitle(label="a) PPV for solo marker", subtitle="R") + ppvNWT_plot + ggtitle(label="", subtitle="NWT") + plot_layout(guides="collect", axes="collect")) / (log_reg_R_plot + ggtitle(label="b) Logistic regression", subtitle="R") + log_reg_NWT_plot + ggtitle(label="", subtitle="NWT") + plot_layout(guides="collect", axes="collect"))

ggsave(width=8, height=9, file="figs/Fig1_individual_variable_stats.pdf")
ggsave(width=8, height=9, file="figs/Fig1_individual_variable_stats.png")

Model performance

model predictions - individual determinants

# R models
model_R_indiv_pred <- predict(model_R_indiv, type="response")
model_R_indiv_aac6Interaction_pred <- predict(model_R_indiv_aac6Interaction, type="response")

# NWT models
model_NWT_indiv_pred <- predict(model_NWT_indiv, type="response")
model_NWT_indiv_aac6Interaction_pred <- predict(model_NWT_indiv_aac6Interaction, type="response")

Logistic regression at the counts level - R, NWT

# R models
model_R_groups <- logistf(resistant ~ QRDR_mutations + acquired_genes + aac6, data=cipro_antibiogram)
model_R_groups_selectInteract <- logistf(resistant ~ QRDR_mutations*aac6 + acquired_genes*aac6, data=cipro_antibiogram)

# NWT models
model_NWT_groups <- logistf(nonWT_binary ~ QRDR_mutations + acquired_genes + aac6, data=cipro_antibiogram)
model_NWT_groups_selectInteract <- logistf(nonWT_binary ~ QRDR_mutations*aac6 + acquired_genes*aac6, data=cipro_antibiogram)

# R model predictions
model_R_groups_pred <- predict(model_R_groups, type="response")
model_R_groups_selectInteract_pred <- predict(model_R_groups_selectInteract, type="response")

# NWT model predictions
model_NWT_groups_pred <- predict(model_NWT_groups, type="response")
model_NWT_groups_selectInteract_pred <- predict(model_NWT_groups_selectInteract, type="response")
count_model_summary <- logreg_details(model_R_groups) %>% mutate(Model="2a", response="R") %>%
  bind_rows(logreg_details(model_R_groups_selectInteract) %>% mutate(Model="2b", response="R")) %>% 
  bind_rows(logreg_details(model_NWT_groups) %>% mutate(Model="2a", response="NWT")) %>%
  bind_rows(logreg_details(model_NWT_groups_selectInteract) %>% mutate(Model="2b", response="NWT")) %>%
  relocate(Model, .before=Determinant) %>% relocate(response, .before=Model) %>%
  write_csv(file="tables/TableS7_CountLogRegCoef.csv")

Table 1 - model summary table

aic <- c(extractAIC(model_R_indiv)[2], 
         extractAIC(model_R_indiv_aac6Interaction)[2], 
         extractAIC(model_R_groups)[2], 
         extractAIC(model_R_groups_selectInteract)[2], 
         extractAIC(model_NWT_indiv)[2], 
         extractAIC(model_NWT_indiv_aac6Interaction)[2], 
         extractAIC(model_NWT_groups)[2], 
         extractAIC(model_NWT_groups_selectInteract)[2])

anova_result <- c("-",anova(model_R_indiv, model_R_indiv_aac6Interaction)$pval,
                  "-",anova(model_R_groups, model_R_groups_selectInteract)$pval,
                  "-",anova(model_NWT_indiv, model_NWT_indiv_aac6Interaction)$pval, 
                  "-", anova(model_NWT_groups, model_NWT_groups_selectInteract)$pval)

metrics_result <- rbind(
  metrics(model_R_indiv_pred>0.5, R_vs_dets$resistant)$summary,
  metrics(model_R_indiv_aac6Interaction_pred>0.5, R_vs_dets$resistant)$summary,
  metrics(model_R_groups_pred>0.5, cipro_antibiogram$resistant)$summary,
  metrics(model_R_groups_selectInteract_pred>0.5, R_vs_dets$resistant)$summary,
  metrics(model_NWT_indiv_pred>0.5, NWT_vs_dets$nonWT_binary)$summary,
  metrics(model_NWT_indiv_aac6Interaction_pred>0.5, NWT_vs_dets$nonWT_binary)$summary,
  metrics(model_NWT_groups_pred>0.5, cipro_antibiogram$nonWT_binary)$summary,
  metrics(model_NWT_groups_selectInteract_pred>0.5, cipro_antibiogram$nonWT_binary)$summary
)

model_summary_tab <- cbind(aic, anova_result, metrics_result) %>% as_tibble() %>%
  rename(Sensitivity=sens, Specificity=spec, ME=me, VME=vme, `Categorical agreement`=cat) %>%
  rename(`model fit (aac6 interaction)`=anova_result) %>% 
  rename(AIC=aic) %>%
  mutate(ModelName=c("Model 1a", "Model 1b", "Model 2a", "Model 2b", "Model 1a", "Model 1b", "Model 2a", "Model 2b")) %>%
  mutate(response = c(rep("R",4), rep("NWT",4))) %>%
  mutate(predictors = rep(c("determinants", "determinant*aac6", "QRDR count + PMQR count + aac6","QRDR count*aac6 + PMQR count*aac6"),2))

NOT USED - logistic regression model summary - facet plot

model_summary_tab_facet <- model_summary_tab
model_summary_tab_facet <- model_summary_tab_facet %>% 
    mutate(Sensitivity=round(as.numeric(Sensitivity)*100,2)) %>%
  mutate(Specificity=round(as.numeric(Specificity)*100,2)) %>%
  mutate(ME=round(as.numeric(ME)*100,2)) %>%
  mutate(VME=round(as.numeric(VME)*100,2)) %>%
  mutate(`Categorical agreement`=round(as.numeric(`Categorical agreement`)*100,2)) %>%
    mutate(AIC=round(as.numeric(AIC),0)) %>%
  mutate(Label=paste0(ModelName, ": ", predictors)) %>%
  rename(Prediction=response)%>%
  select(Label, Prediction, `Categorical agreement`, Sensitivity, Specificity, ME, VME, AIC) %>%
  rename(`Major error`=ME, `Very major error`=VME)

model_summary_tab_long <- model_summary_tab_facet %>%
  pivot_longer(
   `Categorical agreement`:`AIC`, 
    names_to = "Evaluation metric",
    values_to = "value"
)


model_summary_tab_long <- model_summary_tab_long%>%
  mutate(`Evaluation metric` = factor(`Evaluation metric`, levels=c("Categorical agreement", "Sensitivity", "Specificity", "Major error", "Very major error", "AIC"))) 
  
model_summary_tab_long <- model_summary_tab_long%>%
  mutate(Label = factor(Label, levels=c("Model 1a: determinants", "Model 1b: determinant*aac6", "Model 2a: QRDR count + PMQR count + aac6", "Model 2b: QRDR count*aac6 + PMQR count*aac6"))) 
  

model_summary_plot <-
model_summary_tab_long %>% ggplot(aes(x=(as.numeric(value)), y=fct_rev(Label)))+
  geom_point(aes(colour=Prediction), position=pd) +
    geom_text(aes(label=as.numeric(value), colour=Prediction), size=3,position=pd, fontface = "bold", vjust=-0.5, hjust=0.5)+
  scale_color_manual(values=c("R"="IndianRed", "NWT"="#78638a")) +
 facet_wrap(~ `Evaluation metric`, ncol = 6, scales="free_x")+
  theme_linedraw() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank())+
  facetted_pos_scales(
    x = list(`Evaluation metric` == "Categorical agreement" ~ scale_x_continuous(breaks = c(80,100), limits = c(70, 105)),
             `Evaluation metric` == "Sensitivity" ~ scale_x_continuous(breaks = c(80,100), limits = c(65, 100)),
             `Evaluation metric` == "Specificity" ~ scale_x_continuous(breaks = c(80, 100), limits = c(70, 105)),
             `Evaluation metric` == "Major error" ~ scale_x_continuous(breaks = c(0,10), limits = c(0,10)),
             `Evaluation metric` == "Very major error"~ scale_x_continuous(breaks = c(10,20), limits = c(0, 32)),             `Evaluation metric` == "AIC"~ scale_x_continuous(breaks = c(12000,13000), limits = c(11500, 13800))
  ))

model_summary_plot

Rules-based classifier

cipro_antibiogram <- cipro_antibiogram %>% 
  mutate(QRDR_mutations_exclude = str_count(Flq_mutations, "GyrA-87G") + str_count(Flq_mutations, "GyrA-87H")) %>% 
  mutate(PMQR_exclude = str_count(Flq_acquired, "qnrB1.v1") + str_count(Flq_acquired, "qnrB1.v2")) %>%
  mutate(rules_category = case_when(
    QRDR_mutations==0 & acquired_genes==0 & aac6==0 ~ 0, # no determinants -> WT S
    QRDR_mutations==1 & acquired_genes==0 & aac6==0 & QRDR_mutations_exclude==1 ~ 0, # non-associated QRDR solo -> WT S
    QRDR_mutations==0 & acquired_genes==0 & aac6==1 ~ 1, # aac6 solo -> WT S
    QRDR_mutations==0 & acquired_genes==1 & aac6==0 & PMQR_exclude==1 ~ 2, # non-associated PMQR solo -> NWT I
    QRDR_mutations==1 & acquired_genes==0 & aac6==0 & QRDR_mutations_exclude==0 ~ 3, # 1 QRDR, except those not associated -> NWT R
    QRDR_mutations==1 & acquired_genes==0 & aac6==1 ~ 4, # 1 QRDR + aac6 -> NWT R
    QRDR_mutations>=2 & acquired_genes==0 ~ 5, # ≥2 QRDR -> NWT R
    QRDR_mutations==0 & acquired_genes==1 & aac6==0 & PMQR_exclude==0 ~ 6, # 1 PMQR, except those not associated -> NWT R
    QRDR_mutations==0 & acquired_genes==1 & aac6==1 ~ 7, # 1 PMQR + aac6
    QRDR_mutations==0 & acquired_genes>=2 ~ 8, # ≥2 PMQR -> NWT R
    QRDR_mutations>=1 & acquired_genes>=1 ~ 9, # QRDR + PMQR -> NWT R
    TRUE ~ NA
  ))

# check all are assigned
table(cipro_antibiogram$rules_category)
## 
##    0    1    2    3    4    5    6    7    8    9 
## 5680  161  160  103   23 2167  546  819   68 2440
table(is.na(cipro_antibiogram$rules_category))
## 
## FALSE 
## 12167
cipro_antibiogram %>% filter(is.na(rules_category)) %>% select(Flq_mutations, Flq_acquired)
# assign S/R predictions to categories
cipro_antibiogram <- cipro_antibiogram %>% 
  mutate(predSIR = if_else(rules_category<=2, "S", "R")) %>%
  mutate(predSIR = replace(predSIR, rules_category==2, "I")) %>%
  mutate(predSIR = factor(predSIR, levels=c("S", "I", "R"))) %>%
  mutate(predR = if_else(rules_category<=2, "S", "R")) %>%
  mutate(predR = factor(predR, levels=c("S", "R"))) %>%
  mutate(predNWT = if_else(rules_category<2, "WT", "NWT")) %>%
  mutate(predNWT = factor(predNWT, levels=c("WT", "NWT")))

table(cipro_antibiogram$rules_category, cipro_antibiogram$predR)
##    
##        S    R
##   0 5680    0
##   1  161    0
##   2  160    0
##   3    0  103
##   4    0   23
##   5    0 2167
##   6    0  546
##   7    0  819
##   8    0   68
##   9    0 2440
table(cipro_antibiogram$rules_category, cipro_antibiogram$predSIR)
##    
##        S    I    R
##   0 5680    0    0
##   1  161    0    0
##   2    0  160    0
##   3    0    0  103
##   4    0    0   23
##   5    0    0 2167
##   6    0    0  546
##   7    0    0  819
##   8    0    0   68
##   9    0    0 2440
table(cipro_antibiogram$rules_category, cipro_antibiogram$predNWT)
##    
##       WT  NWT
##   0 5680    0
##   1  161    0
##   2    0  160
##   3    0  103
##   4    0   23
##   5    0 2167
##   6    0  546
##   7    0  819
##   8    0   68
##   9    0 2440

Table 1b and numbers for text - classifier overall metrics, by species

species_R <- cipro_antibiogram %>% group_by(species) %>% 
  filter(species != "Klebsiella variicola subsp. tropica") %>%
  filter(species != "Klebsiella quasivariicola") %>%
  filter(species != "Klebsiella africana") %>%
  summarise(cat=cat_agree(predR, resistant), 
            sens=sens(predR, resistant), 
            spec=spec(predR, resistant), 
            me=me(predR, resistant), 
            vme=vme(predR, resistant),
            n=n())%>%
  mutate(out=rep("R", 4)) %>% 
  relocate(out, .before=species)

species_NWT <- cipro_antibiogram %>% group_by(species) %>% 
  filter(species != "Klebsiella variicola subsp. tropica") %>%
  filter(species != "Klebsiella quasivariicola") %>%
  filter(species != "Klebsiella africana") %>%
  summarise(cat=cat_agree(predNWT, nonWT_binary), 
            sens=sens(predNWT, nonWT_binary), 
            spec=spec(predNWT, nonWT_binary), 
            me=me(predNWT, nonWT_binary), 
            vme=vme(predNWT, nonWT_binary),
            n=n()) %>%
  mutate(out=rep("NWT", 4)) %>% 
  relocate(out, .before=species)

species_summary_table <- 
bind_rows(c("out"="R","species"="Klebsiella pneumoniae species complex", 
            metrics(cipro_antibiogram$predR, cipro_antibiogram$resistant)$summary), 
          c("out"="NWT","species"="Klebsiella pneumoniae species complex",
            metrics(cipro_antibiogram$predNWT, cipro_antibiogram$nonWT_binary)$summary)
          ) %>%
  mutate_at(c("cat","sens","spec","me","vme", "n"), as.double) %>%
  bind_rows(species_R) %>%
  bind_rows(species_NWT) %>%
  rename(Sensitivity=sens, Specificity=spec, ME=me, VME=vme, `Categorical agreement`=cat, N=n) %>%
  mutate(Sensitivity=paste0(round(as.numeric(Sensitivity)*100,2),"%")) %>%
  mutate(Specificity=paste0(round(as.numeric(Specificity)*100,2),"%")) %>%
  mutate(ME=paste0(round(as.numeric(ME)*100,2),"%")) %>%
  mutate(VME=paste0(round(as.numeric(VME)*100,2),"%")) %>%
  mutate(`Categorical agreement`=paste0(round(as.numeric(`Categorical agreement`)*100,2),"%")) %>%
  select(out, species, `Categorical agreement`, Sensitivity, Specificity, ME, VME, N)
  

write_csv(species_summary_table, "tables/Table1b_ModelSummary.csv", na="-")

NOT USED - rules-based classifier species summary - facet plot

species_summary_table <- 
bind_rows(c("out"="R","species"="Klebsiella pneumoniae species complex", 
            metrics(cipro_antibiogram$predR, cipro_antibiogram$resistant)$summary), 
          c("out"="NWT","species"="Klebsiella pneumoniae species complex",
            metrics(cipro_antibiogram$predNWT, cipro_antibiogram$nonWT_binary)$summary)
          ) %>%
  mutate_at(c("cat","sens","spec","me","vme", "n"), as.double) %>%
  bind_rows(species_R) %>%
  bind_rows(species_NWT) %>%
  rename(Sensitivity=sens, Specificity=spec, ME=me, VME=vme, `Categorical agreement`=cat, N=n) %>%
  mutate(Sensitivity=round(as.numeric(Sensitivity)*100,2)) %>%
  mutate(Specificity=round(as.numeric(Specificity)*100,2)) %>%
  mutate(ME=round(as.numeric(ME)*100,2)) %>%
  mutate(VME=round(as.numeric(VME)*100,2)) %>%
  mutate(`Categorical agreement`=round(as.numeric(`Categorical agreement`)*100,2)) %>%
  rename(Prediction=out)%>%
  mutate(species=gsub("Klebsiella quasipneumoniae subsp. quasipneumoniae","Kq subsp. quasipneumoniae",species)) %>%
    mutate(species=gsub("Klebsiella quasipneumoniae subsp. similipneumoniae","Kq subsp. similipneumoniae",species)) %>%
    mutate(species=gsub("Klebsiella variicola subsp. variicola","Klebsiella variicola",species)) %>%
  select(species, Prediction, `Categorical agreement`, Sensitivity, Specificity, ME, VME) %>%
  rename(`Major error`=ME, `Very major error`=VME, Label=species)

species_summary_table_long <- species_summary_table %>%
  pivot_longer(
   `Categorical agreement`:`Very major error`, 
    names_to = "Evaluation metric",
    values_to = "value"
)


species_summary_table_long <- species_summary_table_long%>%
  mutate(`Evaluation metric` = factor(`Evaluation metric`, levels=c("Categorical agreement", "Sensitivity", "Specificity", "Major error", "Very major error"))) 
  
species.labels=c("<br>*Klebsiella pnemoniae* species complex (N=12,167)",
                 "<br>*Klebsiella pneumoniae* (N=10,634)",
                 "<br>*Kq* subsp. <br>*quasipneumoniae* (N=189)",
                 "<br>*Kq* subsp. <br>*similipneumoniae* (N=286)",
                 "<br>*Klebsiella variicola* (N=1,037)")

species_summary_table_long <- species_summary_table_long%>%
  mutate(Label = factor(Label, levels=c("Klebsiella pneumoniae species complex", "Klebsiella pneumoniae", "Kq subsp. quasipneumoniae", "Kq subsp. similipneumoniae", "Klebsiella variicola"))) 


rules_species_summary_plot <-
species_summary_table_long %>% ggplot(aes(x=(as.numeric(value)), y=fct_rev(Label)))+
  geom_point(aes(colour=Prediction), position=pd) +
    geom_text(aes(label=as.numeric(value), colour=Prediction), size=3,position=pd, fontface = "bold", vjust=-0.5, hjust=0.5)+
  scale_color_manual(values=c("R"="IndianRed", "NWT"="#78638a")) +
   scale_y_discrete(labels=rev(species.labels)) +
 facet_wrap(~ `Evaluation metric`, ncol = 5, scales="free_x")+
  theme_linedraw() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_markdown())+
  facetted_pos_scales(
    x = list(`Evaluation metric` == "Categorical agreement" ~ scale_x_continuous(breaks = c(80,100), limits = c(70, 105)),
             `Evaluation metric` == "Sensitivity" ~ scale_x_continuous(breaks = c(80,100), limits = c(65, 100)),
             `Evaluation metric` == "Specificity" ~ scale_x_continuous(breaks = c(80, 100), limits = c(70, 105)),
             `Evaluation metric` == "Major error" ~ scale_x_continuous(breaks = c(0,10), limits = c(0,10)),
             `Evaluation metric` == "Very major error"~ scale_x_continuous(breaks = c(10,20), limits = c(0, 32))
  ))
 
rules_species_summary_plot

NOT USED facet plot - combined LR models + rules-based classifier

((model_summary_plot + ggtitle(label="a) Logistic regression models")) / (rules_species_summary_plot + ggtitle(label="b) Rules-based classifiers")) + plot_layout(guides="collect", axes="collect"))

#ggsave(height=8, width=12, file="figs/LRmodel_classifier_performance.png")
#ggsave(height=8, width=12, file="figs/LRmodel_classifier_performance.pdf")

NOT USED - Aligned facet plot - combined LR models + rules-based classifier

species_summary_table_long$Category <- "Rules-based classifiers"
model_summary_tab_long$Category <- "Logistic regression models"


performance_summary <- rbind(species_summary_table_long, model_summary_tab_long)

performance_summary <- performance_summary%>%
  mutate(`Evaluation metric` = factor(`Evaluation metric`, levels=c("Categorical agreement", "Sensitivity", "Specificity", "Major error", "Very major error", "AIC"))) 
  
performance_summary <- performance_summary%>%
  mutate(Label = factor(Label, levels=c("Model 1a: determinants", 
           "Model 1b: determinant*aac6", 
           "Model 2a: QRDR count + PMQR count + aac6", 
           "Model 2b: QRDR count*aac6 + PMQR count*aac6","Klebsiella pneumoniae species complex", "Klebsiella pneumoniae", "Kq subsp. quasipneumoniae", "Kq subsp. similipneumoniae", "Klebsiella variicola"))) 

performance_summary <- performance_summary%>%
  mutate(Category = factor(Category, levels=c("Logistic regression models", "Rules-based classifiers"))) 

y_labels=c("Model 1a: determinants"="Model 1a: determinants", 
           "Model 1b: determinant*aac6"="Model 1b: determinant*aac6", 
           "Model 2a: QRDR count + PMQR count + aac6" ="Model 2a: QRDR count + PMQR count + aac6",
           "Model 2b: QRDR count*aac6 + PMQR count*aac6"="Model 2b: QRDR count*aac6 + PMQR count*aac6",
           "Klebsiella pneumoniae species complex"="<br>*Klebsiella pnemoniae* species complex (N=12,167)",
                 "Klebsiella pneumoniae"="<br>*Klebsiella pneumoniae* (N=10,634)",
                 "Kq subsp. quasipneumoniae"="<br>*Kq* subsp. <br>*quasipneumoniae* (N=189)",
                 "Kq subsp. similipneumoniae"="<br>*Kq* subsp. <br>*similipneumoniae* (N=286)",
                 "Klebsiella variicola"="<br>*Klebsiella variicola* (N=1,037)")

performance_summary_plot <-
performance_summary %>% ggplot(aes(x=(as.numeric(value)), y=fct_rev(Label)))+
 geom_point(aes(colour=Prediction), position=pd) +
    geom_text(aes(label=as.numeric(value), colour=Prediction), size=3,position=pd, fontface = "bold", vjust=-0.5, hjust=0.5)+
  scale_color_manual(values=c("R"="IndianRed", "NWT"="#78638a")) +
facet_grid(rows=vars(Category), cols=vars(`Evaluation metric`), scales="free", drop=FALSE)+
   scale_y_discrete(labels=(y_labels)) +
  theme_linedraw() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_markdown())+
  facetted_pos_scales(
    x = list(`Evaluation metric` == "Categorical agreement" ~ scale_x_continuous(breaks = c(80,100), limits = c(70, 105)),
             `Evaluation metric` == "Sensitivity" ~ scale_x_continuous(breaks = c(80,100), limits = c(65, 100)),
             `Evaluation metric` == "Specificity" ~ scale_x_continuous(breaks = c(80, 100), limits = c(70, 105)),
             `Evaluation metric` == "Major error" ~ scale_x_continuous(breaks = c(0,10), limits = c(0,10)),
             `Evaluation metric` == "Very major error"~ scale_x_continuous(breaks = c(10,20), limits = c(0, 32)),             `Evaluation metric` == "AIC"~ scale_x_continuous(breaks = c(12000,13000), limits = c(11500, 13800))
  ))

#ggsave(height=6, width=12, file="figs/LRmodel_classifier_performance_aligned.png")
#ggsave(height=6, width=12, file="figs/LRmodel_classifier_performance_aligned.pdf")



performance_summary_plot <-
performance_summary %>% ggplot(aes(x=(as.numeric(value)), y=fct_rev(Label)))+
#  geom_point(aes(colour=Prediction), position=pd) +
    geom_text(aes(label=as.numeric(value), colour=Prediction), size=3,position=pd, fontface = "bold")+
  scale_color_manual(values=c("R"="IndianRed", "NWT"="#78638a")) +
facet_grid(rows=vars(Category), cols=vars(`Evaluation metric`), scales="free", drop=FALSE)+
   scale_y_discrete(labels=(y_labels)) +
  theme_linedraw() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_markdown())+
  facetted_pos_scales(
    x = list(`Evaluation metric` == "Categorical agreement" ~ scale_x_continuous(breaks = c(80,100), limits = c(70, 105)),
             `Evaluation metric` == "Sensitivity" ~ scale_x_continuous(breaks = c(80,100), limits = c(65, 100)),
             `Evaluation metric` == "Specificity" ~ scale_x_continuous(breaks = c(80, 100), limits = c(70, 105)),
             `Evaluation metric` == "Major error" ~ scale_x_continuous(breaks = c(0,10), limits = c(0,10)),
             `Evaluation metric` == "Very major error"~ scale_x_continuous(breaks = c(10,20), limits = c(0, 32)),             `Evaluation metric` == "AIC"~ scale_x_continuous(breaks = c(12000,13000), limits = c(11500, 13800))
  ))

#ggsave(height=6, width=12, file="figs/LRmodel_classifier_performance_aligned_number.png")
#ggsave(height=6, width=12, file="figs/LRmodel_classifier_performance_aligned_number.pdf")

Figure 6 - classifier overall metrics, by ST, Country, Source Type, Infection status

ST_R <- cipro_antibiogram %>% group_by(ST) %>% 
  summarise(sens=sens(predR, resistant)*100, 
            spec=spec(predR, resistant)*100, 
            cat=cat_agree(predR, resistant)*100, 
            me=me(predR, resistant)*100, 
            vme=vme(predR, resistant)*100, 
            n=n(),
            n_dataset=n_distinct(index),
            susceptible=n-sum(resistant),
            resistant=sum(resistant),
            percent_R=(sum(resistant))/n*100) %>%
  mutate(out="R") %>% 
  mutate(category="ST") %>% 
  relocate(out, category, .before=ST)%>%
  rename(category_name=ST)
Country_R <- cipro_antibiogram %>% group_by(Country) %>% 
  summarise(sens=sens(predR, resistant)*100, 
            spec=spec(predR, resistant)*100, 
            cat=cat_agree(predR, resistant)*100, 
            me=me(predR, resistant)*100, 
            vme=vme(predR, resistant)*100, 
            n=n(),
            n_dataset=n_distinct(index),
            susceptible=n-sum(resistant),
            resistant=sum(resistant),
            percent_R=(sum(resistant))/n*100) %>%
  mutate(out="R") %>% 
  mutate(category="Country") %>%   
  relocate(out, category, .before=Country)%>%
  rename(category_name=Country)
SourceType_R <- cipro_antibiogram %>% group_by(Source.Type) %>% 
  summarise(sens=sens(predR, resistant)*100, 
            spec=spec(predR, resistant)*100, 
            cat=cat_agree(predR, resistant)*100, 
            me=me(predR, resistant)*100, 
            vme=vme(predR, resistant)*100, 
            n=n(),
            n_dataset=n_distinct(index),
            susceptible=n-sum(resistant),
            resistant=sum(resistant),
            percent_R=(sum(resistant))/n*100) %>%
  mutate(out="R") %>% 
  mutate(category="Source") %>%  
  relocate(out, category, .before=Source.Type)%>%
  rename(category_name=Source.Type)
cipro_antibiogram$Infection.status <- cipro_antibiogram$Infection.status %>% replace_na("Unknown")

InfectionStatus_R <- cipro_antibiogram %>% group_by(Infection.status) %>% 
  summarise(sens=sens(predR, resistant)*100, 
            spec=spec(predR, resistant)*100, 
            cat=cat_agree(predR, resistant)*100, 
            me=me(predR, resistant)*100, 
            vme=vme(predR, resistant)*100, 
            n=n(),
            n_dataset=n_distinct(index),
            susceptible=n-sum(resistant),
            resistant=sum(resistant),
            percent_R=(sum(resistant))/n*100) %>%
  mutate(out="R") %>% 
  mutate(category="Inf") %>% 
  relocate(out, category, .before=Infection.status) %>%
  rename(category_name=Infection.status)
cipro_antibiogram <- cipro_antibiogram %>%
  mutate(platform_type=case_when(
    Laboratory.Typing.Platform=="Sensititre" ~ "Auto",
    Laboratory.Typing.Platform=="Phoenix" ~ "Auto",
    Laboratory.Typing.Platform=="Vitek" ~ "Auto",
    Laboratory.Typing.Platform=="Manual" ~ "Manual",
    TRUE ~ "Unknown"
  ))

platform_R <- cipro_antibiogram %>% 
  group_by(platform_type) %>% 
  summarise(sens=sens(predR, resistant)*100, 
            spec=spec(predR, resistant)*100, 
            cat=cat_agree(predR, resistant)*100, 
            me=me(predR, resistant)*100, 
            vme=vme(predR, resistant)*100, 
            n=n(),
            n_dataset=n_distinct(index),
            susceptible=n-sum(resistant),
            resistant=sum(resistant),
            percent_R=(sum(resistant))/n*100) %>%
  mutate(out="R") %>% 
  mutate(category="Plat") %>% 
  relocate(out, category, .before=platform_type)%>%
  rename(category_name=platform_type)

platform_R
Combined_R <- bind_rows(ST_R, Country_R, SourceType_R, InfectionStatus_R, platform_R)

Combined_R <- Combined_R %>%
  rename(Sensitivity=sens, Specificity=spec, ME=me, VME=vme, `Categorical agreement`=cat, N=n, `Resistant`=percent_R, Category=category, Prediction=out, `Category Name`=category_name) %>%
  write_csv("tables/TableSX_ModelSummary_Combined.csv", na="-")

#filter for categories: susceptible genomes >10, more than 25 genomes, more than 2 datasets
Combined_R_100 <- Combined_R %>% 
  filter(susceptible>10) %>%
  filter(N>25) %>%
  filter(n_dataset >=3) %>%
  mutate(Category_label=paste(`Category Name`, " (n=", N,", S=", susceptible, ")", sep = "")) %>%
  select(Category, Category_label, N, Sensitivity, Specificity, `Resistant`)

# convert table from wide to long for plotting 
Combined_R_100_long<-gather(Combined_R_100, `Evaluation metric`, value, Sensitivity:`Resistant`)
Combined_R_100_long <- Combined_R_100_long[order(Combined_R_100_long$N, decreasing = TRUE),]


Combined_R_100_plot <-ggplot(Combined_R_100_long, aes(as.numeric(value),fct_inorder(as.factor(Category_label)))) +
  geom_point(aes(shape = `Evaluation metric`, size=`Evaluation metric`, colour=`Evaluation metric`), alpha=0.5) +
  scale_shape_manual(values = c(Sensitivity=15, Specificity=18, Resistant=3)) +
  scale_size_manual(values = c(Sensitivity=3, Specificity=3, Resistant=1)) +
  scale_colour_manual(values=c(Sensitivity="#8eb567", Specificity="#9a98ea", Resistant="IndianRed")) +
  scale_y_discrete(limits=rev)+
  facet_grid(Category~., scales = "free", space='free')+ 
  labs(y="", x = "%") + 
  theme_bw()+
  theme(legend.title=element_blank())

Combined_R_100_plot

ggsave(height=8, width=6, file="figs/Fig6_classifierPerformance_category.png")
ggsave(height=8, width=6, file="figs/Fig6_classifierPerformance_category.pdf")

performance by study

Study_R <- cipro_antibiogram %>% group_by(index) %>% 
  summarise(sens=sens(predR, resistant)*100, 
            spec=spec(predR, resistant)*100, 
            cat=cat_agree(predR, resistant)*100, 
            me=me(predR, resistant)*100, 
            vme=vme(predR, resistant)*100, 
            n=n(),
            percent_R=(sum(resistant))/n*100) %>%
  mutate(out="R") %>% 
  mutate(category="index") %>% 
  relocate(out, category, .before=index)%>%
  rename(category_name=index)

performance by MIC platform

platform_R <- cipro_antibiogram %>% 
  filter(Laboratory.Typing.Method!="Disk diffusion") %>%
  group_by(Laboratory.Typing.Platform) %>% 
  summarise(sens=sens(predR, resistant)*100, 
            spec=spec(predR, resistant)*100, 
            cat=cat_agree(predR, resistant)*100, 
            me=me(predR, resistant)*100, 
            vme=vme(predR, resistant)*100, 
            n=n(),
            percent_R=(sum(resistant))/n*100) %>%
  mutate(out="R") %>% 
  mutate(category="Laboratory.Typing.Platform") %>% 
  relocate(out, category, .before=Laboratory.Typing.Platform)%>%
  rename(category_name=Laboratory.Typing.Platform)

platform_R
cipro_antibiogram %>% 
  filter(Laboratory.Typing.Method!="Disk diffusion") %>%
  group_by(Laboratory.Typing.Platform) %>% summarise(studies=length(unique(index)))

performance for DD

method_R <- cipro_antibiogram %>% 
  group_by(Laboratory.Typing.Method) %>% 
  summarise(sens=sens(predR, resistant)*100, 
            spec=spec(predR, resistant)*100, 
            cat=cat_agree(predR, resistant)*100, 
            me=me(predR, resistant)*100, 
            vme=vme(predR, resistant)*100, 
            n=n(),
            percent_R=(sum(resistant))/n*100) %>%
  mutate(out="R") %>% 
  mutate(category="Laboratory.Typing.Method") %>% 
  relocate(out, category, .before=Laboratory.Typing.Method)%>%
  rename(category_name=Laboratory.Typing.Method)
cipro_antibiogram <- cipro_antibiogram %>% 
  mutate(correctR=case_when(predR=="R" & resistant==1 ~ 1, predR=="S" & resistant==0 ~ 1, TRUE ~ 0)) %>%
  mutate(method=if_else(Laboratory.Typing.Method=="Disk diffusion", "DD", "MIC"))

cipro_antibiogram %>% group_by(correctR, predR, resistant) %>% count()
study_method <- logistf(correctR ~ method + index + Source.Type + Infection.status, data=cipro_antibiogram)

summary(study_method)
## logistf(formula = correctR ~ method + index + Source.Type + Infection.status, 
##     data = cipro_antibiogram)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                 coef  se(coef) lower 0.95   upper 0.95
## (Intercept)                5.8570120 0.8564655  4.2184530  7.658961177
## methodMIC                 -0.0899076 0.4747928 -1.0180428  0.878769944
## indexBARNARDS             -1.4160665 0.5737283 -2.7364322 -0.419611810
## indexBSAC                 -0.6989136 0.5945535 -2.0472841  0.354006636
## indexColombia_GHRU         0.1813858 0.6119892 -1.1899732  1.283355039
## indexDenmark_MedVetKlebs  -2.2117851 1.5600901 -5.1541267  2.848904929
## indexEgli                 -1.1519461 0.6538546 -2.5879685  0.041925492
## indexEllington             1.7138286 1.5195999 -0.6446311  6.621759032
## indexHeinz_2019           -1.4100155 0.5849177 -2.7456413 -0.383978292
## indexHuynh_2020           -0.6671717 0.8110845 -2.3647398  0.897857811
## indexIndia_GHRU           -0.1874074 0.5690257 -1.5012259  0.797115455
## indexItaly_MedVetKlebs    -1.7376382 1.0376602 -3.7402136  0.737418522
## indexKASPAH               -0.4145582 0.5798172 -1.7433452  0.598982126
## indexKLEBGAP               0.4362343 0.7217077 -1.1128863  1.811414866
## indexLeangapichart_2021   -3.1431028 0.6964239 -4.6461842 -1.848722010
## indexLong                  0.6524075 0.5722424 -0.6657149  1.645699290
## indexMathers              -0.3750241 0.8500158 -2.0521966  1.450809422
## indexMilan_2019            0.2437530 0.8087871 -1.3985282  1.887236282
## indexNigeria_GHRU         -0.4019747 0.6792921 -1.8625070  0.890156733
## indexPerdigao_2022        -1.3995000 0.7407401 -2.9854852  0.005257405
## indexPhilippines          -0.9322924 0.5873938 -2.2711442  0.100891220
## indexPotter_2018           0.2861738 1.0997035 -1.7833069  2.796837808
## indexSands_2021           -3.6366440 0.9330159 -5.5596073 -1.803232729
## indexSPARK                -0.2411907 0.5719818 -1.5595632  0.751221034
## indexStoesser_2013        -0.4820626 0.7316590 -2.0107663  0.964989633
## indexTorok_2020            1.0511654 0.6248395 -0.3393404  2.186982133
## indexWright                0.2867726 0.8429557 -1.3770025  2.101823402
## Source.TypeEnvironment     0.1880697 0.7157825 -1.1420680  1.874948165
## Source.TypeFeed           -0.3903375 1.6841289 -3.4914472  4.649184688
## Source.TypeFood            1.3220676 1.4750241 -1.6730186  6.326619101
## Source.TypeHuman          -2.0098044 0.4242724 -2.9213898 -1.219666222
## Source.TypeMarine         -0.4111707 1.4769433 -2.5741050  4.467733343
## Infection.statusInfection -0.5972647 0.2096206 -1.0231962 -0.192208138
## Infection.statusUnknown   -0.6946898 0.2912958 -1.2837835 -0.131104697
##                                 Chisq            p method
## (Intercept)               58.81523742 1.731948e-14      2
## methodMIC                  0.03328552 8.552349e-01      2
## indexBARNARDS              8.52073728 3.511222e-03      2
## indexBSAC                  1.58913641 2.074496e-01      2
## indexColombia_GHRU         0.08496624 7.706768e-01      2
## indexDenmark_MedVetKlebs   1.14944935 2.836645e-01      2
## indexEgli                  3.56170038 5.912714e-02      2
## indexEllington             1.84595483 1.742546e-01      2
## indexHeinz_2019            7.80285160 5.216385e-03      2
## indexHuynh_2020            0.67782951 4.103351e-01      2
## indexIndia_GHRU            0.11382632 7.358293e-01      2
## indexItaly_MedVetKlebs     2.11237987 1.461119e-01      2
## indexKASPAH                0.56282928 4.531225e-01      2
## indexKLEBGAP               0.34177884 5.588042e-01      2
## indexLeangapichart_2021   23.86842573 1.031498e-06      2
## indexLong                  1.10463487 2.932512e-01      2
## indexMathers               0.19016311 6.627809e-01      2
## indexMilan_2019            0.09062892 7.633791e-01      2
## indexNigeria_GHRU          0.36125861 5.478081e-01      2
## indexPerdigao_2022         3.81150070 5.090183e-02      2
## indexPhilippines           3.06768967 7.986290e-02      2
## indexPotter_2018           0.06830039 7.938277e-01      2
## indexSands_2021           14.95387013 1.101719e-04      2
## indexSPARK                 0.18874060 6.639669e-01      2
## indexStoesser_2013         0.43843216 5.078802e-01      2
## indexTorok_2020            2.34112846 1.259982e-01      2
## indexWright                0.11711311 7.321869e-01      2
## Source.TypeEnvironment     0.06677484 7.960918e-01      2
## Source.TypeFeed            0.04954485 8.238572e-01      2
## Source.TypeFood            0.73714824 3.905759e-01      2
## Source.TypeHuman          30.25397897 3.790129e-08      2
## Source.TypeMarine          0.06909543 7.926584e-01      2
## Infection.statusInfection  8.50808728 3.535715e-03      2
## Infection.statusUnknown    5.88538359 1.526707e-02      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=397.9632 on 33 df, p=0, n=12167
## Wald test = 3883.461 on 33 df, p = 0

classifier - per genotype profile PPV

category_phenotype_table <- cipro_antibiogram %>% 
  group_by(rules_category, resistant) %>% 
  count() %>% 
  ungroup() %>%
  pivot_wider(names_from=resistant, values_from=n, values_fill=0) %>%
  rename(SI=`0`, R=`1`)

category_phenotype_table <- cipro_antibiogram %>% 
  group_by(rules_category, WT_vs_nonWT) %>% 
  count() %>% 
  ungroup() %>%
  pivot_wider(names_from=WT_vs_nonWT, values_from=n, values_fill=0) %>%
  rename(NWT=`non-wt (I/R)`, WT=`wt (S)`) %>%
  full_join(category_phenotype_table)

category_phenotype_table <- category_phenotype_table %>% 
  mutate(N=NWT+WT) %>%
  mutate(`R PPV`=paste0(round(R/N*100,2), "% (N=", R,"/",N,")")) %>%
  mutate(`NWT PPV`=paste0(round(NWT/N*100,2), "% (N=", NWT,"/",N,")"))

category_phenotype_table

Table 2 - summarise classifier by genotype profile

predR_summary <- cipro_antibiogram %>% 
  group_by(predR, resistant) %>% 
  count() %>% 
  ungroup() %>%
  pivot_wider(names_from=resistant, values_from=n, values_fill=0) %>%
  rename(SI=`0`, R=`1`) %>%
  mutate(N=SI+R) %>%
  mutate(PPV = R/N) 

predNWT_summary <- cipro_antibiogram %>% 
  group_by(predNWT, WT_vs_nonWT) %>% 
  count() %>% 
  ungroup() %>%
  pivot_wider(names_from=WT_vs_nonWT, values_from=n, values_fill=0) %>%
  mutate(N=`non-wt (I/R)`+`wt (S)`) %>%
  mutate(PPV = `non-wt (I/R)`/N) 

category_phenotype_table %>% 
  mutate(QRDR=c("0^", "0", "0", "1", "1", ">1", "0", "0", "0", ">0")) %>%
  mutate(PMQR=c("0", "0", "qnrB1", "0", "0", "0", "1^", "1", ">1", ">0")) %>%
  mutate(aac6=c("0", "1", "0", "0", "1", "*", "0", "1", "*", "*")) %>%
  mutate(Prediction=c("WT S", "WT S", "NWT I", rep("NWT R", 7))) %>%
  mutate(N=as.character(N)) %>%
  select(QRDR, PMQR, aac6, Prediction, N, `R PPV`, `NWT PPV`) %>%
  bind_rows(c(QRDR="Overall", PMQR="", aac6="", Prediction="-", N=as.character(nrow(cipro_antibiogram)), 
              `R PPV`=paste0(round(predR_summary[2, "PPV"]*100,2),"% (N=", predR_summary[2,"R"], "/", predR_summary[2,"N"],")"),
              `NWT PPV`=paste0(round(predNWT_summary[2, "PPV"]*100,2),"% (N=", predNWT_summary[2,"non-wt (I/R)"], "/", predNWT_summary[2,"N"],")")
              )) %>%
  write_csv(file="tables/Table2_Classifier.csv")

write(x="\n^=GyrA-87G and GyrA-87H are not included in QRDR count; qnrB1 is excluded from the single-PMQR count. *aac6 presence is not considered", file="tables/Table2_Classifier.csv", append=T)

Fig 4 - rules prediction plots

# MIC distribution R vs. S
rules_MIC_distributionRS <- cipro_antibiogram %>%
  group_by(Laboratory.Typing.Method) %>%
  filter(Laboratory.Typing.Method != "Disk diffusion") %>% 
  mutate(predSIR_label=case_when(predSIR=="S" ~ "WT S", predSIR=="R" ~ "NWT R", TRUE ~ "NWT I")) %>%
  mutate(predSIR_label=factor(predSIR_label, levels=c("WT S", "NWT I", "NWT R"))) %>%
  ggplot(aes(as.factor(round(as.numeric(Measurement), digits=2)), fill = predSIR_label)) + geom_bar() + 
  labs(title = "A) MIC, by classifer prediction", y = "Number of genomes", x="MIC (mg/L)", 
       linetype = "MIC breakpoints", fill="Classifier prediction") +
  # round off x-axis labels to 2dp
  scale_x_discrete(labels = function(x) round(as.numeric(x), digits = 2)) +
  scale_fill_manual(values = res_colours3)+
  geom_vline(aes(xintercept = 5.5), color = "black") +
  annotate("text", x=5.75, y=2900, label="R", size=2.5)+
  geom_vline(aes(xintercept = 4.5), color = "black") +
  annotate("text", x=4.25, y=2900, label="S", size=2.5)+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust=1, size=11))

rules_MIC_distributionRS

# DD distribution
rules_DD_distributionRS <- cipro_antibiogram %>%
  group_by(Laboratory.Typing.Method) %>%
  filter(Laboratory.Typing.Method == "Disk diffusion") %>%
  mutate(predSIR_label=case_when(predSIR=="S" ~ "WT S", predSIR=="R" ~ "NWT R", TRUE ~ "NWT I")) %>%
  mutate(predSIR_label=factor(predSIR_label, levels=c("WT S", "NWT I", "NWT R"))) %>%
  ggplot(aes(as.numeric(Measurement), fill = predSIR_label)) + geom_bar() + 
  labs(title = "B) Disk zones, by classifer prediction", y = "Number of genomes", x ="Disk zone (mm)",
       linetype = "Disk diffusion \nbreakpoints") +
        scale_fill_manual(values = res_colours3)+
  geom_vline(aes(xintercept = 21.5), color = "black") +
  annotate("text", x=20.5, y=400, label="R", size=2.5)+
  geom_vline(aes(xintercept = 24.5), color = "black") +
  #geom_vline(aes(xintercept = 25.5), color = "black", linetype=2) +
  annotate("text", x=25.5, y=400, label="S", size=2.5)+
  theme_minimal() + 
  theme(axis.text.x = element_text(size=11))+
  theme(legend.position = "none")

rules_DD_distributionRS

genotype profile labels and colours

profile_labels <- paste(
  paste0(c("0^", "0", "0", "1", "1", ">1", "0", "0", "0", ">0"), rep(" QRDR,", 10)),
  c("0 PMQR,", "0 PMQR,", "qnrB1,", "0 PMQR,", "0 PMQR,", "0 PMQR", "1 PMQR^,", "1 PMQR,", ">1 PMQR", ">0 PMQR"),
  c("aac6-", "aac6+", "aac6-", "aac6-", "aac6+", "", "aac6-", "aac6+", "", "")
)
names(profile_labels)<-0:9

profile_colours <- c("0" = "#9DDFFE", "1"="#1A83B6", "2"="#98d3a4", "3"="#fbe8a5", "4"="#ecb16f", "5"="#f17c1b", "6"="#fdc0c0", "7"="#e88f8f", "8"="#b92020", "9"="#59124b")
rules_category_SIR_plot <- cipro_antibiogram %>% 
  ggplot(aes(x=as.factor(rules_category), fill=factor(Resistance.phenotype, levels = c("S", "I", "R")))) + 
  geom_bar(position = "fill") + coord_flip() +
  scale_y_continuous(labels = scales::percent)+
  labs(title="C) Observed phenotype distribution, per genotype profile", x="Genotype profile", y="Proportion (%)", fill="Phenotype") +
  scale_fill_manual(values = res_colours)+
  scale_x_discrete(limits=rev, labels=profile_labels) +
  geom_text_repel(aes(label = after_stat(count)), stat = "count", size=3, direction="x", position=position_fill(0.4)) +
  theme_minimal()
  rules_category_SIR_plot

Figure 4 - classifier predictions vs measurements

rules_MIC_distributionRS / rules_DD_distributionRS /rules_category_SIR_plot

ggsave(height=5, width=6, file="figs/Fig4_classifierPred_MIC_DD_distribution.png")

ggsave(height=5, width=6, file="figs/Fig4_classifierPred_MIC_DD_distribution.pdf")

Fig S9 MIC/DD distribution by genotype profile

MIC distribution

rules_MIC_distribution <- cipro_antibiogram %>%
  filter(Laboratory.Typing.Method != "Disk diffusion") %>% 
  ggplot(aes(x = as.factor(round(as.numeric(Measurement), digits=2)), fill = as.factor(rules_category))) + geom_bar() + 
  labs(x = "MIC (mg/L)", y = "Number of genomes", 
       linetype = "MIC breakpoints", title="A) MIC, by genotype profile", fill="Genotype profile") +
  scale_fill_manual(values=profile_colours, labels=profile_labels)+
  # round off x-axis labels to 2dp
  scale_x_discrete(labels = function(x) round(as.numeric(x), digits = 2)) +
  geom_vline(aes(xintercept = 5.5), color = "black") +
  annotate("text", x=5.75, y=2900, label="R", size=2.5)+
  geom_vline(aes(xintercept = 4.5), color = "black") +
  annotate("text", x=4.25, y=2900, label="S", size=2.5)+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust=1, size=11))

rules_MIC_distribution

# DD distribution
rules_DD_distribution <- cipro_antibiogram %>%
  group_by(Laboratory.Typing.Method) %>%
  filter(Laboratory.Typing.Method == "Disk diffusion") %>%
  ggplot(aes(as.numeric(Measurement), fill = as.factor(rules_category))) + geom_bar() + 
  scale_fill_manual(values=profile_colours, labels=profile_labels)+
  geom_vline(aes(xintercept = 21.5), color = "black") +
  annotate("text", x=20.5, y=400, label="R", size=2.5)+
  geom_vline(aes(xintercept = 24.5), color = "black", linetype=2) +
  geom_vline(aes(xintercept = 25.5), color = "black", linetype=2) +
  annotate("text", x=26.5, y=400, label="S", size=2.5)+
  theme_minimal()+
  labs(y = "Number of genomes", x="Disk zone (mm)", title="B) Disk zones, by genotype profile") +
  theme_minimal() + 
  theme(axis.text.x = element_text(size=11))+
  theme(legend.position = "none")

Figure S10 - genotype profiles vs measurements

rules_MIC_distribution / rules_DD_distribution 

ggsave(height=10, width=8, file="figs/FigS10_genotypeProfiles_MIC_DD_distribution.png")

ggsave(height=10, width=8, file="figs/FigS10_genotypeProfiles_MIC_DD_distribution.pdf")

numbers for text - ATU discrepant isolates

dd_atu <- cipro_antibiogram %>% filter(Resistance.phenotype!="R" & predSIR=="R") %>% filter(Laboratory.Typing.Method == "Disk diffusion") %>% filter(as.numeric(Measurement) >21 & as.numeric(Measurement)<24) 

mic_atu <- cipro_antibiogram %>% filter(Resistance.phenotype!="R" & predSIR=="R") %>% filter(Laboratory.Typing.Method != "Disk diffusion") %>% filter(as.numeric(Measurement) ==0.5) 

atu <- bind_rows(mic_atu, mic_atu) 

atu %>% group_by(Flq_acquired, Flq_mutations) %>% count()  %>% mutate(pc=n/nrow(atu))
unique(atu$index)
##  [1] "KASPAH"             "KLEBGAP"            "India_GHRU"        
##  [4] "Leangapichart_2021" "Philippines"        "Egli"              
##  [7] "Long"               "Mathers"            "Italy_MedVetKlebs" 
## [10] "BSAC"               "Milan_2019"         "Colombia_GHRU"     
## [13] "BARNARDS"           "SPARK"              "Nigeria_GHRU"      
## [16] "Stoesser_2013"      "Heinz_2019"         "Bachman_2022"
atu %>% group_by(index) %>% count()
unique(atu$species)
## [1] "Klebsiella pneumoniae"                             
## [2] "Klebsiella quasipneumoniae subsp. quasipneumoniae" 
## [3] "Klebsiella quasipneumoniae subsp. similipneumoniae"
atu %>% group_by(species) %>% count()

numbers for text - unexplained resistance

# number R with no res determinants
cipro_antibiogram %>% filter(Resistance.phenotype=="R" & QRDR_mutations==0 & acquired_genes==0 & aac6==0) %>% nrow()
## [1] 140
cipro_antibiogram %>% filter(Resistance.phenotype=="R") %>% nrow()
## [1] 6177
# assay measures for unexplained R
cipro_antibiogram %>% filter(Resistance.phenotype=="R" & QRDR_mutations==0 & acquired_genes==0 & aac6==0) %>% filter(Laboratory.Typing.Method!="Disk diffusion" & Measurement==1) %>% nrow()
## [1] 53
cipro_antibiogram %>% filter(Resistance.phenotype=="R" & QRDR_mutations==0 & acquired_genes==0 & aac6==0) %>% filter(Laboratory.Typing.Method=="Disk diffusion" & Measurement %in% c(20, 21)) %>% nrow()
## [1] 20
# STs for unexplained R
cipro_antibiogram %>% filter(Resistance.phenotype=="R" & QRDR_mutations==0 & acquired_genes==0 & aac6==0) %>% group_by(ST) %>% count()
cipro_antibiogram %>% filter(Resistance.phenotype=="R" & QRDR_mutations==0 & acquired_genes==0 & aac6==0) %>% group_by(ST) %>% count() %>% nrow()
## [1] 84
# porin mutations in all genomes without QRDR/PMQR/aac6
cipro_antibiogram %>% filter(QRDR_mutations==0 & acquired_genes==0& aac6==0 & Omp_mutations !="-") %>% nrow()
## [1] 94
cipro_antibiogram %>% filter(QRDR_mutations==0 & acquired_genes==0 & aac6==0) %>% nrow()
## [1] 5671
(cipro_antibiogram %>% filter(QRDR_mutations==0 & acquired_genes==0& aac6==0 & Omp_mutations !="-") %>% nrow()) / (cipro_antibiogram %>% filter(QRDR_mutations==0 & acquired_genes==0 & aac6==0) %>% nrow())
## [1] 0.01657556
cipro_antibiogram %>% filter(QRDR_mutations==0 & acquired_genes==0 & aac6==0) %>% group_by(Omp_mutations) %>% count()
cipro_antibiogram %>% filter(QRDR_mutations==0 & acquired_genes==0 & aac6==0) %>% filter(grepl('OmpK35', Omp_mutations))%>% nrow()
## [1] 45
cipro_antibiogram %>% filter(QRDR_mutations==0 & acquired_genes==0 & aac6==0) %>% filter(grepl('OmpK36', Omp_mutations))%>% nrow()
## [1] 53
# genomes without QRDR/PMQR/aac6
noResDet <- cipro_antibiogram %>% filter(QRDR_mutations==0 & acquired_genes==0 & aac6==0) %>% mutate(ompK35=if_else(grepl('OmpK35', Omp_mutations), 1, 0)) %>% mutate(ompK36=if_else(grepl('OmpK36', Omp_mutations), 1, 0))

fisher.test(noResDet$ompK35, noResDet$resistant)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  noResDet$ompK35 and noResDet$resistant
## p-value = 6.934e-09
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   6.15008 28.58310
## sample estimates:
## odds ratio 
##     13.761
prop.test(table(noResDet$ompK35,noResDet$resistant)[2:1,2:1])
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  table(noResDet$ompK35, noResDet$resistant)[2:1, 2:1]
## X-squared = 82.013, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.08469031 0.35834006
## sample estimates:
##     prop 1     prop 2 
## 0.24444444 0.02292926
fisher.test(noResDet$ompK36, noResDet$resistant)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  noResDet$ompK36 and noResDet$resistant
## p-value = 0.04102
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.8500308 9.1624131
## sample estimates:
## odds ratio 
##   3.289251
prop.test(table(noResDet$ompK36,noResDet$resistant)[2:1,2:1])
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  table(noResDet$ompK36, noResDet$resistant)[2:1, 2:1]
## X-squared = 3.7993, df = 1, p-value = 0.05127
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.02948782  0.13201541
## sample estimates:
##    prop 1    prop 2 
## 0.0754717 0.0242079
# porin mutations in unexplained R

cipro_antibiogram %>% filter(Resistance.phenotype=="R" & Flq_acquired=="-" & Flq_mutations=="-" & aac6==0) %>% filter(Omp_mutations!="-") %>% nrow()
## [1] 14
cipro_antibiogram %>% filter(Resistance.phenotype=="R" & Flq_acquired=="-" & Flq_mutations=="-" & aac6==0) %>% filter(Omp_mutations!="-") %>% nrow()/ (cipro_antibiogram %>% filter(Resistance.phenotype=="R" & QRDR_mutations==0 & acquired_genes==0 & aac6==0) %>% nrow())
## [1] 0.1
cipro_antibiogram %>% filter(Resistance.phenotype=="R" & Flq_acquired=="-" & Flq_mutations=="-" & aac6==0) %>% group_by(Omp_mutations) %>% count()
# assay measures for unexplained R with porin mutations
cipro_antibiogram %>% filter(Resistance.phenotype=="R" & Flq_acquired=="-" & Flq_mutations=="-" & aac6==0) %>% group_by(Omp_mutations, Measurement, Measurement.units) %>% count() %>% filter(Omp_mutations!="-") %>% arrange(Measurement)

numbers for text - unexplained S

# number S with mutations
S_mut <- cipro_antibiogram %>% filter(Resistance.phenotype=="S" & ((QRDR_mutations==1 & Flq_mutations!="GyrA-87G"& Flq_mutations!="GyrA-87H") | (acquired_genes==1 & !grepl('qnrB1.', Flq_acquired)))) 

S_mut %>% nrow()
## [1] 41
# assay measures for unexplained R
S_mut %>% filter(Laboratory.Typing.Method!="Disk diffusion" & Measurement==0.25) %>% nrow()
## [1] 30
S_mut %>% filter(Laboratory.Typing.Method=="Disk diffusion" & Measurement %in% c(25, 26)) %>% nrow()
## [1] 4
S_mut %>% group_by(ST) %>% count()
S_mut %>% group_by(ST) %>% count()%>% nrow()
## [1] 33
S_mut %>% group_by(Flq_acquired, Flq_mutations) %>% count()
# assay measures for unexplained R with porin mutations
S_mut %>% group_by(Flq_acquired, Flq_mutations, Measurement, Measurement.units) %>% count() %>% arrange(Measurement)

MIC regression models

MIC_vs_dets <- cipro_antibiogram %>% 
  filter(Laboratory.Typing.Method!="Disk diffusion") %>%
  select(`GyrA-83F`:`qnrVC6`, aac6, Measurement) %>%
  rename(`aac(6')-Ib-cr`=aac6) %>%
  select(-c(ends_with('?'), ends_with('*'))) # remove imprecise matches

# exclude genomes with rare variants
MIC_vs_dets <- MIC_vs_dets[,colSums(MIC_vs_dets) >= 20]

# model MIC vs all common variants (N>=20) = Model 1a
model_MIC_indiv <- lm(log2(Measurement) ~ ., data=MIC_vs_dets)

# model R vs all common variants (N>=20), each with aac6 interaction = Model 1b
model_MIC_indiv_aac6Interaction <- lm(log2(Measurement) ~ .*`aac(6')-Ib-cr`, data=MIC_vs_dets)

Linear regression models

plot MIC linear regression coefficients for each determinant

# summarise model output
linreg_MIC_model1a <- linreg_details(model_MIC_indiv) %>% mutate(interaction=NA) %>% mutate(model="a") %>% relocate(interaction, .after=Determinant) %>%
    mutate(Determinant=gsub("`","",Determinant)) 
linreg_MIC_model1b <- linreg_details(model_MIC_indiv_aac6Interaction) %>% 
  rename(Term=Determinant) %>%
  separate_wider_delim(Term, ":", names=c("Determinant","interaction"), too_few="align_start", cols_remove=F) %>% 
  mutate(model="b") %>%
  mutate(Determinant=gsub("`","",Determinant))  %>%
  mutate(Term=gsub("`","",Term))  %>%
  mutate(interaction=gsub("`","",interaction)) 

# all individual determinants, plus significant positive interactions
lin_reg_MIC_plot <- linreg_MIC_model1a %>% bind_rows(linreg_MIC_model1b) %>%
    filter((pval<0.05 & est>0) | is.na(interaction)) %>%
    mutate(interaction=factor(replace(interaction, is.na(interaction), "none"), levels=c("none", "aac(6')-Ib-cr"))) %>%
    filter(Determinant != "(Intercept)") %>%
    ggplot(aes(y=Determinant)) + 
    geom_vline(xintercept=0) + 
    geom_linerange(aes(xmin=ci.lower, xmax=ci.upper, group=model, col=model, linetype=interaction), position=pd) +
    geom_point(aes(x=est, group=model, col=model), position=pd) + 
    scale_colour_manual(values=c(a="grey", b="black")) +
    theme_bw() + labs(x="Regression coefficient", col="Model", linetype="Interaction")

lin_reg_MIC_plot

disk diffusion linear regression

DD_vs_dets <- cipro_antibiogram %>% 
  filter(Laboratory.Typing.Method=="Disk diffusion") %>%
  select(`GyrA-83F`:`qnrVC6`, aac6, Measurement) %>%
  rename(`aac(6')-Ib-cr`=aac6) %>%
  select(-c(ends_with('?'), ends_with('*'))) # remove imprecise matches

# exclude genomes with rare variants
DD_vs_dets <- DD_vs_dets[,colSums(DD_vs_dets) >= 20]

# model MIC vs all common variants (N>=20) = Model 1a
model_DD_indiv <- lm(Measurement ~ ., data=DD_vs_dets)

# model R vs all common variants (N>=20), each with aac6 interaction = Model 1b
model_DD_indiv_aac6Interaction <- lm(Measurement ~ .*`aac(6')-Ib-cr`, data=DD_vs_dets)

plot MIC linear regression coefficients for each determinant

# summarise model output
linreg_DD_model1a <- linreg_details(model_DD_indiv) %>% mutate(interaction=NA) %>% mutate(model="a") %>% relocate(interaction, .after=Determinant) %>%
    mutate(Determinant=gsub("`","",Determinant)) 
linreg_DD_model1b <- linreg_details(model_DD_indiv_aac6Interaction) %>% 
  rename(Term=Determinant) %>%
  separate_wider_delim(Term, ":", names=c("Determinant","interaction"), too_few="align_start", cols_remove=F) %>% 
  mutate(model="b") %>%
  mutate(Determinant=gsub("`","",Determinant))  %>%
  mutate(Term=gsub("`","",Term))  %>%
  mutate(interaction=gsub("`","",interaction)) 

# all individual determinants, plus significant positive interactions
lin_reg_DD_plot <- linreg_DD_model1a %>% bind_rows(linreg_DD_model1b) %>%
    filter((pval<0.05 & est<0) | is.na(interaction)) %>%
    mutate(interaction=replace(interaction, is.na(interaction), "-")) %>%
    filter(Determinant != "(Intercept)") %>%
    ggplot(aes(y=Determinant)) + 
    geom_vline(xintercept=0) + 
    geom_linerange(aes(xmin=ci.lower, xmax=ci.upper, group=model, col=model, linetype=interaction), position=pd) +
    geom_point(aes(x=est, group=model, col=model), position=pd) + 
    scale_colour_manual(values=c(a="grey", b="black")) +
    theme_bw() + labs(x="Regression coefficient", col="Model", linetype="Interaction") + 
  theme(legend.position="none")

lin_reg_DD_plot

lin_reg_MIC_plot + ggtitle("MIC model") + lin_reg_DD_plot + ggtitle("Disk diffusion zone model") + plot_layout(axes="collect", guides="collect")

ggsave(width=8, height=5, file="figs/FigX_regressionMIC_DD_Models.pdf")
ggsave(width=8, height=5, file="figs/FigX_regressionMIC_DD_Models.png")
cipro_antibiogram %>% filter(Laboratory.Typing.Method!="Disk diffusion") %>% ggplot(aes(x=factor(rules_category),y=Measurement)) + geom_boxplot() + scale_y_continuous(trans = log2_trans(), breaks = 2^(-5:9), labels = function(x) round(as.numeric(x), digits = 3))

MIC distribution - classifier profiles

rules_MIC_distribution_separate <- cipro_antibiogram %>%
  filter(Laboratory.Typing.Method != "Disk diffusion") %>% 
  mutate(profiles=case_when(rules_category==0 ~ profile_labels["0"],
                                  rules_category==1 ~ profile_labels["1"],
                                  rules_category==2 ~ profile_labels["2"],
                                  rules_category==3 ~ profile_labels["3"],
                                  rules_category==4 ~ profile_labels["4"],
                                  rules_category==5 ~ profile_labels["5"],
                                  rules_category==6 ~ profile_labels["6"],
                                  rules_category==7 ~ profile_labels["7"],
                                  rules_category==8 ~ profile_labels["8"],
                                  rules_category==9 ~ profile_labels["9"],
                                  TRUE ~ NA
                                  )) %>%
  mutate(profiles=fct_relevel(profiles, profile_labels)) %>%
  ggplot(aes(x = as.factor(round(as.numeric(Measurement), digits=2)), 
             fill = profiles)) + 
  geom_bar() + 
  labs(x = "MIC (mg/L)", y = "Number of genomes") +
  scale_fill_manual(values=unname(profile_colours))+
  # round off x-axis labels to 2dp
  scale_x_discrete(labels = function(x) round(as.numeric(x), digits = 2)) +
  geom_vline(aes(xintercept = 5.5), color = "black") +
  geom_vline(aes(xintercept = 4.5), color = "black") +
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust=1, size=8), 
        axis.text.y = element_text(size=7),
        legend.position="none") + 
  facet_wrap(~profiles, scales="free_y", ncol=1)

rules_MIC_distribution_separate

expected MIC value per category

discrete_quantile <- function(x, p) {
  sort(x)[length(x)*p]
}

cipro_antibiogram <- cipro_antibiogram %>%
  mutate(profiles=case_when(rules_category==0 ~ profile_labels["0"],
                                  rules_category==1 ~ profile_labels["1"],
                                  rules_category==2 ~ profile_labels["2"],
                                  rules_category==3 ~ profile_labels["3"],
                                  rules_category==4 ~ profile_labels["4"],
                                  rules_category==5 ~ profile_labels["5"],
                                  rules_category==6 ~ profile_labels["6"],
                                  rules_category==7 ~ profile_labels["7"],
                                  rules_category==8 ~ profile_labels["8"],
                                  rules_category==9 ~ profile_labels["9"],
                                  TRUE ~ NA
                                  )) %>%
  mutate(profiles=fct_relevel(profiles, profile_labels))

MIC_pred <- cipro_antibiogram %>%
  filter(Laboratory.Typing.Method != "Disk diffusion") %>% 
  group_by(profiles) %>% 
  summarise(median=median(Measurement),  
            lower25=discrete_quantile(as.numeric(Measurement), p=0.25), 
            upper25=discrete_quantile(as.numeric(Measurement), p=0.75)) %>%
  mutate(IQR=paste0(median, " [", lower25, "-", upper25, "]"))

MIC_pred
write_tsv(MIC_pred, file="tables/TableSX_classifier_MICpred.tsv")

FIGURE 5 - MIC distribution - classifier profiles

rules_MIC_distribution_separate <- cipro_antibiogram %>%
  filter(Laboratory.Typing.Method != "Disk diffusion") %>% 
  left_join(MIC_pred) %>%
  mutate(y=case_when(rules_category==0 ~ 1000,
                                  rules_category==1 ~ 50,
                                  rules_category==2 ~ 25,
                                  rules_category==3 ~ 15,
                                  rules_category==4 ~ 4,
                                  rules_category==5 ~ 500,
                                  rules_category==6 ~ 100,
                                  rules_category==7 ~ 200,
                                  rules_category==8 ~ 10,
                                  rules_category==9 ~ 1000,
                                  TRUE ~ NA
                                  )) %>%
  ggplot(aes(x = as.factor(round(as.numeric(Measurement), digits=2)), 
             fill = profiles)) + 
  geom_bar() + 
  #geom_text(aes(label=IQR, y=y), x=as.factor("256"), check_overlap = T, size=3) +
  labs(x = "MIC (mg/L)", y = "Number of genomes") +
  scale_fill_manual(values=c("LightBlue", "LightBlue", "#78638a", rep("IndianRed", 7)))+
  # round off x-axis labels to 2dp
  scale_x_discrete(labels = function(x) round(as.numeric(x), digits = 2)) +
  geom_vline(aes(xintercept = 5.5), color = "black") +
  geom_vline(aes(xintercept = 4.5), color = "black") +
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust=1, size=8), 
        axis.text.y = element_text(size=7),
        legend.position="none") + 
  facet_wrap(~profiles, scales="free_y", ncol=1)

rules_MIC_distribution_separate

rules_MIC_distribution_separate

ggsave(width=5, height=7, file="figs/Fig5_MICdist_byProfile.pdf")
ggsave(width=5, height=7, file="figs/Fig5_MICdist_byProfile.png")

disk zone distribution - classifier profiles

rules_DD_distribution_separate <- cipro_antibiogram %>%
  filter(Laboratory.Typing.Method == "Disk diffusion") %>% 
  mutate(profiles=case_when(rules_category==0 ~ profile_labels["0"],
                                  rules_category==1 ~ profile_labels["1"],
                                  rules_category==2 ~ profile_labels["2"],
                                  rules_category==3 ~ profile_labels["3"],
                                  rules_category==4 ~ profile_labels["4"],
                                  rules_category==5 ~ profile_labels["5"],
                                  rules_category==6 ~ profile_labels["6"],
                                  rules_category==7 ~ profile_labels["7"],
                                  rules_category==8 ~ profile_labels["8"],
                                  rules_category==9 ~ profile_labels["9"],
                                  TRUE ~ NA
                                  )) %>%
  mutate(profiles=fct_relevel(profiles, profile_labels)) %>%
  ggplot(aes(x = as.numeric(Measurement), fill = profiles)) + 
  geom_bar() + 
  labs(y = "Number of genomes", 
       x="Disk zone (mm)", 
       title="B) Disk zones, by genotype profile") +
  scale_fill_manual(values=unname(profile_colours))+
  # round off x-axis labels to 2dp
  scale_x_discrete(labels = function(x) round(as.numeric(x), digits = 2)) +
  geom_vline(aes(xintercept = 21.5), color = "black") +
  geom_vline(aes(xintercept = 24.5), color = "black", linetype=2) +
  geom_vline(aes(xintercept = 25.5), color = "black", linetype=2) +
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust=1, size=10), 
        legend.position="none") + 
  facet_wrap(~profiles, scales="free_y", ncol=2)

rules_DD_distribution_separate